This project involves the use of the Nightingale metabolomic panel and Olink inflammation panel to analyze metabolomic/proteomic differences between those with TB, those with DM, and those with both TB and DM. Some of the TB and TB-DM samples were also measured at baseline and after 2 months of treatment for TB.
Code
# Loading packages.library(psych)library(DESeq2)library(ggrepel)library(RColorBrewer)library(pheatmap)library(limma)library(volcano3D)#library(volcano3Ddata)library(readxl)library(EnvStats)library(ggcorrplot)library(knitr)library(factoextra)library(OlinkAnalyze)library(table1)library(tidygraph)library(igraph)library(ggraph)library(foreign)library(haven)library(iglu)library(openxlsx)#library(CorLevelPlot)library(flashClust)library(cowplot)library(impute)library(WGCNA)library(reshape2)library(fs)library(tidyverse)# Setting path variable to location of this analysis.path1 <-"/Users/julia/Library/CloudStorage/OneDrive-Radboudumc/Data_analyses/Julia_TANDEM/"# Setting working directory to path variable.setwd(path1)knitr::opts_chunk$set(message = F, warning = F)
Metadata
The initial list of 352 samples that was provided to for Olink measurements was the basis for merging to other data (template Samplemanifest_TANDEM_JB_T96_x plt.xlsx)
We also used a separate plate layout document to obtain the Sample IDs that were provided to Nightingale (Plate layout TANDEM Olink JB.xlsx)
We obtained the corresponding Patient IDs and timepoints from the PLASMA sheet of DATA SAMPEL TANDEM 2021.xlsx
We obtained the phenotype and HbA1c from a file provided by Ajie (211209_TANDEM_sample_list)
Code
# Reading in olink plate layout document from H drive for all 352 samples.olink_layout <-read_excel(path(path1, "data/template Samplemanifest_TANDEM_JB_T96_x plt.xlsx"), col_types ="text") |>select(SampleID, PlateID, WellID, PatientID = Subject_ID) |>filter(!str_detect(WellID, "12"))# Reading in plate layout document final sample list sheet.final_samplelist <-read_excel(path(path1, "data/Plate layout TANDEM Olink JB.xlsx"), sheet =3, col_types ="text") |>select(SampleID = IdPlasma,PatientID = study_id,PlateID =`Location Olink plates`,PlateNumID =`Olink plate number`,ShipmentID =`IdSample shipment`)# Confirming that the SampleID column is identical in both files (the missing value in final_samplelist became a 0 in olink file).# tibble(ID1 = olink_layout$SampleID, ID2 = final_samplelist$SampleID) |># filter(ID1 != ID2 | is.na(ID1) | is.na(ID2))# Confirming that the PatientID columns are identical in both files except for missing values in final_samplelist. For missing values the olink document has the ShipmentID instead of PatientID.# tmp <- tibble(ID1 = olink_layout$Subject_ID, ID2 = final_samplelist$PatientID) |># filter(ID1 != ID2 | is.na(ID1) | is.na(ID2))# # tmp2 <- olink_layout |># filter(Subject_ID %in% tmp$ID1) |># left_join(final_samplelist, by = "SampleID")# Checking for duplicate sampleID - 1 sample was duplicated (ID 11101416). We can rely on the original metadata to clarify which patientID/sampleID combo was correct and exclude only the incorrect one. The tube itself had the correct SampleID on it.# table(final_samplelist$SampleID)[which(!table(final_samplelist$SampleID) == 1)]# Combining files.all_samples <- olink_layout |>select(-PatientID) |>cbind(final_samplelist |>select(-SampleID, -PlateID)) |>select(SampleID, PatientID, PlateNumID, ShipmentID, OlinkWellID = WellID, OlinkPlateID = PlateID)# Reading in the basic metadata file that was sent with the samples from Indonesia.original_metadata <-read_excel(path(path1, "data/DATA SAMPEL TANDEM 2021.xlsx"), sheet ="PLASMA", col_types ="text")# Merging both files, this gives the correct patientID for each sampleID and also gives the timepoint.all_samples_meta <- all_samples |>left_join(original_metadata, by =c("SampleID"="IdPlasma")) |>mutate(SampleID =case_when(SampleID =="11101416"& PatientID != IdPatient ~"Exclude", SampleID =="0"~"Exclude",TRUE~ SampleID),Timepoint =case_when(SampleID =="Exclude"~NA_character_, Visit =="1"~"Baseline", Visit %in%c("3", "4") ~"Month_2"),PatientID =case_when(SampleID =="Exclude"~NA_character_,TRUE~ IdPatient),Location =case_when(SampleID =="Exclude"~NA_character_,TRUE~ Location),ShipmentID =case_when(SampleID =="Exclude"~NA_character_,TRUE~ ShipmentID)) |>select(SampleID:OlinkPlateID, Timepoint, Location)# Obtaining phenotype and hba1c from one of the rds files made by Ajie.samplelist_rds <-read_rds(path(path1, "data/211209_TANDEM_sample_list")) |>filter(study_id %in% all_samples_meta$PatientID) |>select(PatientID = study_id, Phenotype = phenotype, HbA1c = lab_hba1c_result) |>mutate(Phenotype =case_when(Phenotype =="TB-diabetes"~"TB-DM", Phenotype =="TB-no-diabetes"~"TB", Phenotype =="no-TB-diabetes"~"DM"),HbA1c_categories =case_when(HbA1c <5.7~"Normal (<5.7%)", HbA1c >=5.7& HbA1c <6.4~"Elevated (5.7%-6.4%)", HbA1c >=6.5& HbA1c <10~"Diabetes (>6.5%-9.9%)", HbA1c >=10~"Diabetes (>10%)"))# Obtaining age, sex, bmi, medication, diabetes duration, and TB culture results from the other rds file made by Ajie.# Extracting the columns of interest.metadata_rds <-read_rds(path(path1, "data/211209_TANDEM_ComprehensiveMetadata")) |>filter(study_id %in% all_samples_meta$PatientID) |>select(PatientID = study_id, Age = age_b, Sex = sex, Weight = bmi_weight, Height = bmi_height, redcap_event_name, Insulin1 = dm_drugs___1, Metformin1 = dm_drugs___2, Sulphonurea = dm_drugs___3, Insulin2 = dm_meds___1, Metformin2 = dm_meds___2, Statin = dm_meds___5, Steroids1 = started_steroids, Steroids2 = steroids_1, DM_duration = dm_when, cul_res_o, cul_res_o2, cul_res_o_3)# Cleaning up drug information.drugs <- metadata_rds |>group_by(PatientID) |>summarize(Insulin =sum(Insulin1 =="Checked"| Insulin2 =="Checked", na.rm = T),Metformin =sum(Metformin1 =="Checked"| Metformin2 =="Checked", na.rm = T),Sulphonurea =sum(Sulphonurea =="Checked", na.rm = T),Statin =sum(Statin =="Checked", na.rm = T),Steroids_Yes =sum(Steroids1 =="Ya"| Steroids2 =="Ya", na.rm = T),Steroids_No =sum(Steroids1 =="Tidak"| Steroids2 =="Tidak", na.rm = T)) |>mutate(Insulin =case_when(Insulin ==0~"No",TRUE~"Yes"),Metformin =case_when(Metformin ==0~"No",TRUE~"Yes"),Sulphonurea =case_when(Sulphonurea ==0~"No",TRUE~"Yes"),Statin =case_when(Statin ==0~"No",TRUE~"Yes"),Steroids =case_when(Steroids_Yes >0~"Yes", Steroids_No >0~"No",TRUE~"Unknown")) |>select(-(Steroids_Yes:Steroids_No))# Cleaning up TB culture information.culture <- metadata_rds |>mutate(Timepoint =case_when(is.na(redcap_event_name) | redcap_event_name =="0 month"~"Baseline_Culture",str_detect(redcap_event_name, "^2 month") ~"Month_2_Culture",str_detect(redcap_event_name, "^6 month") ~"Month_6_Culture",str_detect(redcap_event_name, "^12 month") ~"Month_12_Culture",str_detect(redcap_event_name, "^18 month") ~"Month_18_Culture")) |>filter(!is.na(Timepoint)) |>group_by(PatientID, Timepoint) |>summarize(Positive =sum(cul_res_o =="Positive"| cul_res_o2 =="Positive"| cul_res_o_3 =="Positive", na.rm = T),Negative =sum(cul_res_o =="Negative"| cul_res_o2 =="Negative"| cul_res_o_3 =="Negative", na.rm = T)) |>mutate(Culture_Result =case_when(Positive >0~"Positive", Negative >0~"Negative",TRUE~"Unknown")) |>select(-Positive, -Negative) |>pivot_wider(names_from = Timepoint, values_from = Culture_Result)# Keeping all other columns and identifying duplicated samples.metadata_rds2 <- metadata_rds |>filter(is.na(redcap_event_name) |str_detect(redcap_event_name, "^0")) |>select(PatientID:redcap_event_name, DM_duration)dup_meta_rds <- metadata_rds2 |>filter(duplicated(PatientID) |duplicated(PatientID, fromLast = T), redcap_event_name =="0 month")# Excluding duplicated samples (preferentially keeping data from TB database), merging drug/culture information, then cleaning up the columns and calculating BMI.metadata_rds_dedup <- metadata_rds2 |>filter(!PatientID %in% dup_meta_rds$PatientID) |>rbind(dup_meta_rds) |>select(-redcap_event_name) |>left_join(drugs, by ="PatientID") |>left_join(culture, by ="PatientID") |>mutate(DM_duration_years =case_when(DM_duration =="< 12 bulan"~"< 1", DM_duration =="1-5 tahun"~"1 - 5", DM_duration =="6-15 tahun"~"6 - 15", DM_duration =="15+ tahun"~"> 15"),Age =round(Age, 1),Weight =round(Weight, 1),BMI = Weight / ((Height /100) ^2),BMI_categories =case_when(BMI <18.5~"Underweight (<18.5)", BMI >=18.5& BMI <23.0~"Normal (18.5-22.9)", BMI >=23.0& BMI <25.0~"Overweight (23.0-24.9)", BMI >=25~"Obese (>=25)")) |>select(-DM_duration)# Obtaining smoking status, waist-to-hip ratio, haemoglobin, IGRA, and comorbidity data from DM spreadsheet.dm_excel <-read_excel(path(path1, "data/Sample ID and DM patients data merged.xlsx"), col_types ="text") |>filter(study_id %in% all_samples_meta$PatientID) |>select(PatientID = study_id, Smoker = smokes, Waist = waist, Hip = hip, Haemoglobin = hb_result, IGRA = qft_result, IGRA2 = igra_status, dcangina, dcstroke, dcheartattack, dcheartbypass, eyecataract, eyeglaucoma, eyeblind, eyesight, Kidney_Failure = renal, Foot_amputation = dclost_limb) |>mutate(Smoker =case_when(Smoker =="Tidak sama sekali"~"No", Smoker =="Setiap hari"~"Yes", Smoker =="Kurang dari setiap hari"~"Yes"),Waist =round(as.numeric(Waist), 1),Hip =round(as.numeric(Hip), 1),Haemoglobin =round(as.numeric(Haemoglobin), 1),Waist_to_Hip_Ratio = Waist / Hip,Angina =case_when(dcangina =="tidak"~"No", dcangina =="ya"~"Yes", dcangina =="tidak tahu"~"Unknown"),Stroke =case_when(dcstroke =="tidak"~"No", dcstroke =="ya"~"Yes", dcstroke =="tidak tahu"~"Unknown"),Heart_Attack =case_when(dcheartattack =="tidak"~"No", dcheartattack =="ya"~"Yes", dcheartattack =="tidak tahu"~"Unknown"),Heart_Bypass =case_when(dcheartbypass =="tidak"~"No", dcheartbypass =="ya"~"Yes", dcheartbypass =="tidak tahu"~"Unknown"),Eye_Cataract =case_when(eyecataract =="tidak"~"No", eyecataract =="ya"~"Yes", eyecataract =="tidak tahu"~"Unknown"),Eye_Glaucoma =case_when(eyeglaucoma =="tidak"~"No", eyeglaucoma =="ya"~"Yes", eyeglaucoma =="tidak tahu"~"Unknown"),Eye_Blind =case_when(eyeblind =="tidak"~"No", eyeblind =="ya"~"Yes", eyeblind =="tidak tahu"~"Unknown"),Eye_Sight =case_when(eyesight =="tidak"~"No", eyesight =="ya"~"Yes", eyesight =="tidak tahu"~"Unknown"),Kidney_Failure =case_when(Kidney_Failure =="tidak"~"No", Kidney_Failure =="ya"~"Yes", Kidney_Failure =="tidak tahu"~"Unknown"),Foot_amputation =case_when(Foot_amputation =="tidak"~"No", Foot_amputation =="ya"~"Yes")) |>select(-(IGRA2:eyesight))# Checking which IGRA column to keep above (IGRA column had all information as in IGRA2).# tmp <- dm_excel |> # filter(IGRA != IGRA2 | is.na(IGRA) | is.na(IGRA2))# Obtaining smoking status, waist-to-hip ratio, haemoglobin, and eGFR from TB spreadsheet.tb_excel <-read_excel(path(path1, "data/Sample ID and TB patients data merged.xlsx"), col_types ="text") |>filter(study_id %in% all_samples_meta$PatientID) |>select(PatientID = study_id, Smoker = smokes, Waist = waist, Hip = hip, Haemoglobin = hb_result, eGFR = calc_egfr) |>mutate(Smoker =case_when(Smoker =="Tidak sama sekali"~"No", Smoker =="Setiap hari"~"Yes", Smoker =="Kurang dari setiap hari"~"Yes"),Waist =round(as.numeric(Waist), 1),Hip =round(as.numeric(Hip), 1),Haemoglobin =round(as.numeric(Haemoglobin), 1),Waist_to_Hip_Ratio = Waist / Hip)# Obtaining MTB strain and drug resistance information from WGS filewgs_data <-read_excel(path(path1, "data/WGS_data_from Carolien.xlsx"), sheet ="Sheet1", col_types ="text") |>mutate(Resistance =case_when(rifampicin !="-"& isoniazid !="-"~"Both", rifampicin !="-"~"Rifampicin", isoniazid !="-"~"Isoniazid",TRUE~"None")) |>select(PatientID = stnum, MTB_strain = name, Resistance) |>filter(PatientID %in% all_samples_meta$PatientID)# Obtaining CXR score and death data from Alit's metadata.alit_table <-read_xls(path(path1, "data/Tandem Olink Metadata_Timika_Outcomes.xls"), col_types ="text") |>select(PatientID = study_id, Timika_score = timika_score, Withdraw_reason = withdraw_reason) |>mutate(Timika_score =as.numeric(Timika_score),Death =case_when(Withdraw_reason =="Death"~"Yes",TRUE~"Unknown")) |>select(-Withdraw_reason)# Combining all information into a final metadata table and cleaning up columns.all_samples_meta_final <- all_samples_meta |>left_join(samplelist_rds, by ="PatientID") |>left_join(metadata_rds_dedup, by ="PatientID") |>left_join(alit_table, by ="PatientID") |>left_join(wgs_data, by ="PatientID") |>left_join(tb_excel, by ="PatientID") |>left_join(dm_excel, by ="PatientID") |>mutate(Smoker =case_when(!is.na(Smoker.x) ~ Smoker.x,TRUE~ Smoker.y),Waist =case_when(!is.na(Waist.x) ~ Waist.x,TRUE~ Waist.y),Hip =case_when(!is.na(Hip.x) ~ Hip.x,TRUE~ Hip.y),Haemoglobin =case_when(!is.na(Haemoglobin.x) ~ Haemoglobin.x,TRUE~ Haemoglobin.y),Waist_to_Hip_Ratio =case_when(!is.na(Waist_to_Hip_Ratio.x) ~ Waist_to_Hip_Ratio.x,TRUE~ Waist_to_Hip_Ratio.y),Treatment_Failure =case_when(Death =="Yes"| Month_6_Culture =="Positive"| Month_12_Culture =="Positive"| Month_18_Culture =="Positive"~"Yes",TRUE~"No")) |>select(-Smoker.x, -Waist.x, -Hip.x, -Haemoglobin.x, -Waist_to_Hip_Ratio.x, -Smoker.y, -Waist.y, -Hip.y, -Haemoglobin.y, -Waist_to_Hip_Ratio.y, -Location)# Validating TB phenotype (note that 11 patients had negative baseline cultures, but each appears to have a positive smear result or CXR score). Added a column for which samples this was.tmp <- all_samples_meta_final |>filter(Phenotype %in%c("TB", "TB-DM"), Baseline_Culture =="Negative")# tmp2 <- read_rds(path("data/211209_TANDEM_ComprehensiveMetadata")) |># filter(study_id %in% tmp$PatientID) |># select(study_id, redcap_event_name, starts_with("smear_res"), starts_with("cul_res_o"))all_samples_meta_final <- all_samples_meta_final |>mutate(Double_Check_TB =case_when(PatientID %in% tmp$PatientID ~"Check",TRUE~"Culture_Positive"))# Validating DM phenotype.# tmp <- all_samples_meta_final |> # filter(Phenotype %in% c("DM", "TB-DM"),# HbA1c_categories != "Diabetes (>6.5%)")# # tmp <- all_samples_meta_final |> # filter(Phenotype %in% c("TB"),# HbA1c_categories == "Diabetes (>6.5%)")# Validating Treatment_Failure column.# tmp <- all_samples_meta_final |> # filter(Treatment_Failure == "Yes")# Filtering out missing values for timepoint. has_na <-is.na(all_samples_meta_final$Timepoint)all_samples_meta_new <- all_samples_meta_final[!has_na, ]rownames(all_samples_meta_new) <-NULL# Cleaning up R environment.rm(alit_table, all_samples, all_samples_meta, culture, dm_excel, drugs, dup_meta_rds, final_samplelist, metadata_rds, metadata_rds_dedup, metadata_rds2, olink_layout, original_metadata, samplelist_rds, tb_excel, tmp, wgs_data)
> Olink
Inflammatory protein concentrations in the plasma of these individuals were measured using the Olink Target 96 Inflammation panel (v3.024). This includes a total of 92 inflammatory proteins. Julia Brake used the in-house Olink machine for this, performed by Liesbeth van Emst.
All proteins in this experiment were normalized using the Intensity Normalization (v2) method which normalized all values between plates such that they end up with the same median value for each plate. LOD is calculated per plate based on the negative control readings for each protein + 3SD. There were also sample controls on each plate. For calculating differences, one can simply subtract the normalized protein expression (NPX) in each group and this is the equivalent of a log2FC (as NPX are on a log2 scale).
Reading data + attaching metadata
Code
# Reading in the data and cleaning up names of proteins.olink_data <-read_NPX(path(path1, "data/TANDEM_NPX.xlsx")) |>mutate(Assay =case_when(Assay =="IL8"~"IL-8", Assay =="IL7"~"IL-7", Assay =="LAP TGF-beta-1"~"LAP TGF-β1", Assay =="IL6"~"IL-6", Assay =="IL-1 alpha"~"IL-1α", Assay =="IL2"~"IL-2", Assay =="IL18"~"IL-18", Assay =="TGF-alpha"~"TGFα", Assay =="IL-22 RA1"~"IL-22RA1", Assay =="Beta-NGF"~"β-NGF", Assay =="IL13"~"IL-13", Assay =="IL10"~"IL-10", Assay =="Flt3L"~"FLT3L", Assay =="IL33"~"IL-33", Assay =="IFN-gamma"~"IFNγ", Assay =="IL4"~"IL-4", Assay =="IL5"~"IL-5", Assay =="TNFB"~"LTα",TRUE~ Assay))# Attaching metadata and excluding any samples missing a phenotype (removes excluded samples and sample controls).olink_data_meta <- olink_data |>left_join(all_samples_meta_final, by ="SampleID") |>filter(!is.na(Phenotype))
Quality control
Olink data quality control is largely done prior to obtaining the file. Any cells in the excel file with no values are cells that failed to produce data or cells that failed QC and were manually excluded by the Olink analyst.
Reading online (https://olink.com/faq/what-does-it-mean-if-samples-are-flagged-in-the-qc/) it seems that the term “Warning” really means “Fail” for the QC_Warning column (due to an internal control on the plate such as the incubation or detection control being outside of the accepted range). It therefore makes sense to remove all samples that received a QC warning. However, having spoken to the Olink analyst, she already excluded samples that had sufficiently deviating values across all measured proteins, and these all have blank cells in the output file, therefore it is recommended that we should use all samples unless they substantially deviate when plotting a PCA (or IQR vs median plot). Also, the best practice in other fields is to set all values below the LOD to the LOD itself, however we have been informed this is essentially unnecessary with Olink data as the range of data below the LOD is extremely small and this will make essentially no difference to the overall results.
Code
# Identifying samples that are missing values.missing_data <- olink_data_meta |>group_by(SampleID) |>summarize(N_NA =sum(is.na(NPX))) |>filter(N_NA >0)# Excluding samples with all missing values.olink_data_meta_qc1 <- olink_data_meta |>filter(!SampleID %in% (missing_data |>filter(N_NA ==92) |>pull(SampleID)))# Visualizing potential outliers by IQR vs sample median. Clearly two samples are outliers and should be removed (11101749 and 11101239).olink_qc_plot(olink_data_meta_qc1)
# Code for removing all samples with QC warning and setting all values below the LOD to the LOD itself. Note this was not used in the end.# olink_data_meta_qc <- olink_data_meta |> # filter(QC_Warning == "Pass") |> # mutate(NPX = case_when(NPX < LOD ~ LOD,# TRUE ~ NPX))
Next, percentage of samples below the limit of detection per group for each assay is calculated and the difference in this percentage between phenotype groups/timepoints is observed. If there is no obvious difference between groups then all proteins with > 25% samples below LOD will be excluded. Note that this is sometimes referred to as missingness, but this is misleading as there is a value provided, it was simply below the LOD.
Based on the below output, all assays that have more than 25% of samples below LOD in all groups are excluded, which removes 15 of the 92 proteins.
Code
# Calculating percentage below LOD per assay for each phenotype/timepoint.below_LOD_data <- olink_data_meta_qc2 |>mutate(below_LOD = NPX < LOD) |>group_by(Phenotype, Timepoint, Assay) |>summarize(below_LOD_percent =sum(below_LOD, na.rm = T) /n() *100)# Identifying all Assays with at least one group over 25% below LOD.over_25 <- below_LOD_data |>group_by(Assay) |>summarize(N_over25 =sum(below_LOD_percent >25)) |>filter(N_over25 >0)# Plotting LOD data.below_LOD_data |>filter(Assay %in% over_25$Assay) |>mutate(Assay =factor(Assay),Assay =reorder(Assay, below_LOD_percent, decreasing = T)) |>ggplot(aes(x = Assay, y = below_LOD_percent, color = Phenotype, shape = Timepoint)) +geom_hline(yintercept =25, color ="darkred", linetype =2) +geom_point(size =2) +scale_y_continuous(expand =c(0, 0), limits =c(0, 100)) +scale_color_manual(values =c("goldenrod3", "blue4", "deeppink3")) +labs(y ="Below LOD (%)") +theme_bw() +theme(axis.text.x =element_text(angle =45, hjust =1),axis.title.x =element_blank(),panel.grid.minor.y =element_blank())
Code
ggsave(filename =path(path1, "output/Olink_LODs.png"), height =5, width =7, units ="in")# Excluding assays with more than 25% of samples below LOD in all 5 phenotype groups.olink_data_meta_qc_final <- olink_data_meta_qc2 |>filter(!Assay %in% (over_25 |>filter(N_over25 ==5) |>pull(Assay)))# Cleaning up R environment.rm(below_LOD_data, missing_data, olink_data, olink_data_meta, olink_data_meta_qc1, olink_data_meta_qc2, over_25)
# stratified by sex # filter for baseline samples (because only one level of nesting is supported in table) baseline_samples <- unique_patients_Olink %>%filter(Timepoint =="Baseline")tableSex <-table1(~ Waist_to_Hip_Ratio + Haemoglobin | Phenotype + Sex, data = baseline_samples)tableSex
DM
TB
TB-DM
Overall
Female (N=39)
Male (N=57)
Female (N=44)
Male (N=49)
Female (N=41)
Male (N=50)
Female (N=124)
Male (N=156)
Waist_to_Hip_Ratio
Mean (SD)
0.890 (0.0571)
0.933 (0.0489)
0.806 (0.0610)
0.822 (0.0576)
0.860 (0.0660)
0.877 (0.0652)
0.850 (0.0703)
0.880 (0.0729)
Median [Min, Max]
0.887 [0.771, 0.989]
0.933 [0.821, 1.06]
0.797 [0.684, 0.922]
0.813 [0.704, 0.989]
0.858 [0.691, 0.979]
0.879 [0.759, 1.05]
0.855 [0.684, 0.989]
0.887 [0.704, 1.06]
Haemoglobin
Mean (SD)
13.1 (1.61)
14.6 (1.84)
11.7 (1.56)
12.8 (1.60)
12.2 (1.60)
13.5 (1.75)
12.3 (1.69)
13.7 (1.89)
Median [Min, Max]
13.2 [7.90, 16.2]
14.5 [9.50, 18.2]
11.9 [6.80, 14.3]
12.9 [7.70, 15.8]
12.1 [8.80, 15.0]
13.7 [7.30, 17.2]
12.6 [6.80, 16.2]
13.9 [7.30, 18.2]
Metadata differences
Code
# use only baseline samples baseline_samples <- unique_patients_Olink %>%filter(Timepoint =="Baseline")# display 15 decimals of p-value resultoptions(digits =15)
write_csv(tmp, file =path(path1, "output/baseline_all_median.csv"), col_names = F)# Filtering to exclude measurements of biomarkers that didn't have both timepoints for a patient.both_timepoint <- olink_data_meta_qc_final |>select(PatientID, Assay, Timepoint) |>unique() |>group_by(PatientID, Assay) |>summarize(N =n()) |>filter(N ==2)filtered_data <- olink_data_meta_qc_final |>left_join(both_timepoint, by =c("PatientID", "Assay")) |>filter(N ==2) |>select(-N)tmp <- filtered_data |>rbind(olink_data_meta_qc_final |>filter(Phenotype =="DM")) |>select(SampleID, Phenotype, Timepoint, Assay, NPX) |>group_by(Phenotype, Timepoint, Assay) |>summarize(Median =median(NPX, na.rm = T)) |>pivot_wider(names_from = Assay, values_from = Median) |>t() |>as.data.frame() |>rownames_to_column("Assay")write_csv(tmp, file =path(path1, "output/paired_all_median.csv"), col_names = F)
Associations with metadata
This analysis uses a series of linear regression to test the association of the scaled NPX values with scaled variables (Age, BMI, Sex, Timika score, Haemoglobin, HbA1c, Month 6 sputum culture result). The plot then uses the estimate (absolute effect size) for each predictor on the outcome protein as the size of the square and the color is based on the FDR-multiple-testing corrected p-value of the association.
Kruskal Wallis non-parametric test is used instead of the default one-way ANOVA of the polar_coords function to generate a three-way Volcano plot.
Code
# Make data frame with adj. p-values from previous Wilcox tests for each comparison. # the first column is a group test such as one-way ANOVA or Kruskal-Wallis test.# columns 2-4 contain adj. p-values one for each comparison in the sequence A vs B, A vs C, B vs C. # A = TB, B = DM, C = TB-DM# Perform Kruskal Wallis. olink_data_meta_qc_final$Phenotype <-ordered(olink_data_meta_qc_final$Phenotype,levels =c("TB", "DM", "TB-DM"))kruskal_results <- olink_data_meta_qc_final |>filter(Timepoint =="Baseline",!is.na(NPX)) |>group_by(Assay) |>summarise(p_value =kruskal.test(NPX ~ Phenotype)$p.value)kruskal_results <- kruskal_results |>mutate(adj_p_value =p.adjust(p_value, method ="BH"))print(kruskal_results)
ggsave(filename =path(path1, "output/2DVolcano_Olink.svg"), height =7, width =7, units ="in")
> Nightingale
The same 352 samples were sent to Nightingale for metabolomic analysis. Nightingale provides these results in a single Excel file which also has all QC related information.
Quality control
Nightingale performs a large amount of QC prior to sending the results. For sample QC, they don’t provide measurements for any sample that they excluded, such as those with solid material in the tube.
Samples which failed the Nightingale QC and biomarkers with > 25% below LOD were excluded. This excludes a total of 21 samples and 2 biomarkers.
Code
# Reading in raw Nightingale results file and adding metadata.nightingale <-read_excel(path(path1, "data/Nightingale_Results_uncleaned.xlsx"), sheet =1, skip =10) |>select(-(`...1`:`...2`)) |>slice(-(1:4)) |>rename(Sample =`Sample id`) |>pivot_longer(!Sample, names_to ="Biomarker", values_to ="Concentration") |>left_join(all_samples_meta_final, by =c("Sample"="PlateNumID")) |>filter(SampleID !="Exclude")# Reading in sample QC sheet and adding to master file. In total 21 samples were flagged for exclusion by Nightingale. Resulting N = 331.sample_qc <-read_excel(path(path1, "data/Nightingale_Results_uncleaned.xlsx"), sheet =2, skip =11) |>select(-`Sample has remarks`) |>slice(-(1:3)) |>rename(Sample =`...2`) |>filter(`Sample excluded`=="x")nightingale_qc <- nightingale |>left_join(sample_qc, by ="Sample") |>filter(is.na(`Sample excluded`))# Reading in biomarker QC sheet and adding to master file.biomarker_qc <-read_excel(path(path1, "data/Nightingale_Results_uncleaned.xlsx"), sheet =3, skip =10) |>select(-`...1`) |>slice(-(1:4)) |>rename(Sample =`Sample id`) |>pivot_longer(cols =`Total-C`:`S-HDL-TG %`, names_to ="Biomarker", values_to ="QC") |>filter(!is.na(QC))nightingale_qc2 <- nightingale_qc |>left_join(biomarker_qc, by =c("Sample", "Biomarker"))#table(nightingale_qc2$QC)# Calculating percentage below LOD per assay for each phenotype/timepoint.below_LOD_data <- nightingale_qc2 |>mutate(below_LOD = QC =="Below limit of quantification") |>group_by(Phenotype, Timepoint, Biomarker) |>summarize(below_LOD_percent =sum(below_LOD, na.rm = T) /n() *100)# Identifying all Assays with at least one group over 25% below LOD.over_25 <- below_LOD_data |>group_by(Biomarker) |>summarize(N_over25 =sum(below_LOD_percent >25)) |>filter(N_over25 >0)# Plotting LOD data.below_LOD_data |>filter(Biomarker %in% over_25$Biomarker) |>mutate(Biomarker =factor(Biomarker),Biomarker =reorder(Biomarker, below_LOD_percent, decreasing = T)) |>ggplot(aes(x = Biomarker, y = below_LOD_percent, color = Phenotype, shape = Timepoint)) +geom_hline(yintercept =25, color ="darkred", linetype =2) +geom_point(size =2) +scale_y_continuous(expand =c(0, 0), limits =c(0, 100)) +scale_color_manual(values =c("goldenrod3", "blue4", "deeppink3")) +labs(y ="Below LOD (%)") +theme_bw() +theme(axis.text.x =element_text(angle =45, hjust =1),axis.title.x =element_blank(),panel.grid.minor.y =element_blank())
Code
# Excluding assays with more than 25% of samples below LOD in all 5 phenotype groups.nightingale_qc_final <- nightingale_qc2 |>filter(!Biomarker %in% (over_25 |>filter(N_over25 ==5) |>pull(Biomarker)))# Cleaning up R environment.rm(below_LOD_data, sample_qc, biomarker_qc, nightingale, nightingale_qc, nightingale_qc2, over_25)# Converting Concentration column to numeric and checking which remaining values become NA. Only 27 measurements are NA and all had a QC warning of some type. This is fine.tmp <- nightingale_qc_final |>mutate(numeric_conc =as.numeric(Concentration)) |>filter(is.na(numeric_conc)) |>select(Sample:Concentration, QC)nightingale_qc_final <- nightingale_qc_final |>mutate(Concentration =as.numeric(Concentration)) |>filter(!is.na(Concentration))
Using linear regression to test the association of the scaled Concentration values with scaled Age, Sex, BMI, and HbA1c. The plot then just uses the estimate for each predictor on the outcome protein as the size of the square and the color is based on the FDR corrected p-value of the association.
Kruskal Wallis non-parametric test is used instead of the default one-way ANOVA of the polar_coords function to generate a three-way Volcano plot.
Code
# Make data frame with adj. p-values from previous Wilcox tests for each comparison. # the first column is a group test such as one-way ANOVA or Kruskal-Wallis test.# columns 2-4 contain adj. p-values one for each comparison in the sequence A vs B, A vs C, B vs C. # A = TB, B = DM, C = TB-DM # Perform Kruskal Wallis. nightingale_qc_final$Phenotype <-ordered(nightingale_qc_final$Phenotype,levels =c("TB", "DM", "TB-DM"))kruskal_results <- nightingale_qc_final |>filter(Biomarker %in% Julias_selection, Timepoint =="Baseline") |>group_by(Biomarker) |>summarise(p_value =kruskal.test(Concentration ~ Phenotype)$p.value)kruskal_results <- kruskal_results |>mutate(adj_p_value =p.adjust(p_value, method ="BH"))print(kruskal_results)
ggsave(filename =path(path1, "output/2DVolcano_Nightingale.svg"), height =6, width =5, units ="in")
> Integration
Correlation Olink + Nightingale
Correlate inflammatory protein marker and lipid marker measurements.
Code
# Subset Olink dataframe for baseline.Olink <- olink_data_meta_qc_final |>filter(Timepoint =="Baseline") |>select(SampleID, Phenotype, Assay, NPX) |>pivot_wider(names_from = Assay, values_from = NPX)# Subset Nightingale dataframe.NG <- nightingale_qc_final |>filter(Timepoint =="Baseline", Biomarker %in% Julias_selection) |>select(SampleID, Phenotype, Biomarker, Concentration) |>pivot_wider(names_from = Biomarker, values_from = Concentration)# Merge inflammatory proteins and lipid measurements based on the same SampleIDs.Combi <-merge(Olink, NG, by ="SampleID", all.x =TRUE)table(Combi$Phenotype.y) # --> phenotype x is from Olink and y is from Nightingale
TB DM TB-DM
90 92 82
Code
# Subset Olink dataframe for Month 2Olink <- olink_data_meta_qc_final |>filter(Timepoint =="Month_2") |>select(SampleID, Phenotype, Assay, NPX) |>pivot_wider(names_from = Assay, values_from = NPX)# Subset Nightingale dataframe.NG <- nightingale_qc_final |>filter(Timepoint =="Month_2", Biomarker %in% Julias_selection) |>select(SampleID, Phenotype, Biomarker, Concentration) |>pivot_wider(names_from = Biomarker, values_from = Concentration)# Merge inflammatory proteins and lipid measurements based on the same SampleIDs.CombiM2 <-merge(Olink, NG, by ="SampleID", all.x =TRUE)table(CombiM2$Phenotype.x) # --> phenotype x is from Olink and y is from Nightingale
TB DM TB-DM
33 0 32
Spearman
Code
## Spearman at Baseline for TB-DM alone.#colnames(Combi)tmp <- Combi |>filter(Phenotype.x =="TB-DM") |>select(3:79, all_of(Small_selection_lipids2)) |>drop_na()# filter for comparisons you would like to plot.tmp2 <-cor(tmp, method ="spearman")[colnames(tmp)[3:79], Small_selection_lipids2]custom_colors <-c("blue", "white", "red")ggcorrplot(tmp2,colors = custom_colors,lab =TRUE, # Show correlation coefficients on the plotlab_size =1, # Size of correlation coefficients texttitle ="TB-DM", # Title of the plotggtheme = ggplot2::theme_minimal()) # Minimal theme for cleaner look
ggsave(filename =path(path1, "output/Spearman_Olink_NG_Combi_TBDM.svg"), , plot =last_plot(), height =10, width =3, units ="in")## Spearman at baseline for DM alone. #colnames(Combi)tmp <- Combi |>filter(Phenotype.x =="DM") |>select(3:79, all_of(Small_selection_lipids2)) |>drop_na()# filter for comparisons you would like to plot. tmp2 <-cor(tmp, method ="spearman")[colnames(tmp)[3:79], Small_selection_lipids2]custom_colors <-c("blue", "white", "red")ggcorrplot(tmp2,colors = custom_colors,lab =TRUE, # Show correlation coefficients on the plotlab_size =1, # Size of correlation coefficients texttitle ="DM", # Title of the plotggtheme = ggplot2::theme_minimal()) # Minimal theme for cleaner look
ggsave(filename =path(path1, "output/Spearman_Olink_NG_Combi_DM.svg"), , plot =last_plot(), height =10, width =3, units ="in")## Spearman at Baseline for TB alone.#colnames(Combi)tmp <- Combi |>filter(Phenotype.x =="TB") |>select(3:79, all_of(Small_selection_lipids2)) |>drop_na()# filter for comparisons you would like to plot. tmp2 <-cor(tmp, method ="spearman")[colnames(tmp)[3:79], Small_selection_lipids2]custom_colors <-c("blue", "white", "red")ggcorrplot(tmp2,colors = custom_colors,lab =TRUE, # Show correlation coefficients on the plotlab_size =1, # Size of correlation coefficients texttitle ="TB", # Title of the plotggtheme = ggplot2::theme_minimal()) # Minimal theme for cleaner look